The goal of this notebook is to explore by different ways the companies' bankruptcy data, analyse the general behavior of data and all form of correlation between the variables and finally to know what variables are useless and have to be ejected.
import numpy as np
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
init_notebook_mode()
warnings.filterwarnings("ignore")
# Load the human-readable variable descriptions (X1..X65 -> formula text).
with open('../data/description.json', 'r') as f:
    description = json.load(f)  # json.load reads the file object directly
# Collect the five "{N}year" CSV files. sorted() makes the ordering
# deterministic: Path.glob yields entries in OS-dependent order.
dirpath = Path('../data/extension/csv')
lstfile = sorted(dirpath.glob('*year.csv'))
lstfile
[WindowsPath('../data/extension/csv/1year.csv'),
WindowsPath('../data/extension/csv/2year.csv'),
WindowsPath('../data/extension/csv/3year.csv'),
WindowsPath('../data/extension/csv/4year.csv'),
WindowsPath('../data/extension/csv/5year.csv')]
# One DataFrame per horizon, keyed by the file stem ('1year' .. '5year').
dataset = {path.stem: pd.read_csv(path) for path in lstfile}
shapes = {name: frame.shape for name, frame in dataset.items()}
shapes
{'1year': (7026, 65),
'2year': (10172, 65),
'3year': (10502, 65),
'4year': (9791, 65),
'5year': (5909, 65)}
# Bar chart of dataset sizes; datasets under 10k rows are highlighted in red.
plt.figure(figsize=(12,6))
plt.title('Total number of companies in each dataset', fontweight='bold')
row_counts = [shape[0] for shape in shapes.values()]
bar_colors = ['darkred' if count < 10000 else 'darkblue' for count in row_counts]
ax = plt.bar(shapes.keys(), row_counts, color=bar_colors)
# Class balance per dataset: X65 is the bankruptcy label (0 = solvent,
# 1 = bankrupt). reindex([0, 1]) pins the row order explicitly instead of
# relying on value_counts() returning counts sorted by frequency — the old
# code only labelled the rows correctly while class 0 was the majority in
# every dataset.
bankruptcy = pd.concat(
    [frame.X65.value_counts().reindex([0, 1]) for frame in dataset.values()],
    axis=1, ignore_index=True)
bankruptcy.columns = dataset.keys()
bankruptcy.index = ['no bankrupt', 'bankrupt']
bankruptcy
| 1year | 2year | 3year | 4year | 5year | |
|---|---|---|---|---|---|
| no bankrupt | 6755 | 9772 | 10007 | 9276 | 5499 |
| bankrupt | 271 | 400 | 495 | 515 | 410 |
# Log-scale bar plot of both classes, with the raw count printed on each bar.
ax = bankruptcy.T.plot.bar(log=True, color=['lime','crimson'], figsize=(12,6), title='Bankrupt companies')
for patch in ax.patches:
    x_center = patch.get_x() + patch.get_width() / 2.
    ax.annotate(patch.get_height(), (x_center, 10), ha='center', va='center')
In these datasets, missing values are denoted by a "?".
# Per-dataset missing-value totals and frequencies ('?' marks a missing cell).
missing_val = {name: (frame == '?').sum().sum() for name, frame in dataset.items()}
# frame.size == rows * cols, so this is the fraction of missing cells.
freq = {name: (frame == '?').sum().sum() / frame.size for name, frame in dataset.items()}
f, ax = plt.subplots(1, 2, figsize=(15, 6))
f.suptitle('Missing values')
ax[0].barh(list(missing_val.keys()), missing_val.values(), color='purple')
ax[0].set_title('Total number')
ax[1].barh(list(freq.keys()), freq.values(), color='orange')
ax[1].set_title('Frequency')
Text(0.5, 1.0, 'Frequency')
# Focus on the n-th dataset from here on.
n = 3
pd.options.display.max_rows = 100
# Boolean mask of missing cells, same shape as the dataset.
missing = (dataset[f'{n}year'] == '?')
per_column = missing.sum(axis=0)
# Traffic-light colouring: red > 100 missing, orange > 10, green otherwise.
color = ['r' if count > 100 else 'orange' if count > 10 else 'limegreen'
         for count in per_column]
per_column.plot(kind='bar', figsize=(15,5), log=True, color=color)
plt.title(f'Number of missing values per variable in the {n}th dataset')
Text(0.5, 1.0, 'Number of missing values per variable in the 3th dataset')
# Group variables that share the same missing-value count, so the table
# shows one row per count with the member variables joined into a label.
per_column = missing.sum(axis=0)
missing_col = pd.DataFrame({'missing values': per_column.values},
                           index=missing.columns.values)
groups = [(count, ', '.join(members.index.values))
          for count, members in missing_col.groupby('missing values')]
missing_col = pd.DataFrame(
    {'Number of missing values': [int(count) for count, _ in groups]},
    index=[label for _, label in groups])
missing_col.sort_values('Number of missing values', ascending=False)\
    .style.background_gradient(cmap='Blues', vmax=500, vmin=20, low=0.1, high=0.9)
| Number of missing values | |
|---|---|
| X37 | 4736 |
| X21 | 807 |
| X27 | 715 |
| X60 | 592 |
| X45 | 591 |
| X28, X53, X54, X64 | 228 |
| X24 | 227 |
| X41 | 202 |
| X32 | 101 |
| X47, X52 | 86 |
| X13, X19, X20, X23, X30, X31, X39, X42, X43, X44, X49, X56, X62 | 43 |
| X58 | 29 |
| X5 | 25 |
| X4, X12, X33, X40, X46, X63 | 18 |
| X61 | 17 |
| X8, X16, X17, X26, X34, X50 | 14 |
| X15 | 8 |
| X9 | 3 |
| X1, X2, X3, X6, X7, X10, X11, X14, X18, X22, X25, X29, X35, X36, X38, X48, X51, X55, X57, X59, X65 | 0 |
# Top 50 rows (companies) ranked by how many values they are missing.
per_row = missing.sum(axis=1)
per_row.sort_values(ascending=False).head(50).plot(kind='bar', figsize=(15,5))
plt.title('Max number of missing values per row')
Text(0.5, 1.0, 'Max number of missing values per row')
# Variables with 50+ missing values are considered too sparse to keep.
col_missing = missing.sum(axis=0)
useless_col = col_missing[col_missing >= 50]
useless_col.name = 'Useless variables'
useless_col
X21 807 X24 227 X27 715 X28 228 X32 101 X37 4736 X41 202 X45 591 X47 86 X52 86 X53 228 X54 228 X60 592 X64 228 Name: Useless variables, dtype: int64
# After removing the sparse columns, flag companies that still have 5+
# missing values as not worth imputing.
missing.drop(columns=useless_col.index, inplace=True)
row_missing = missing.sum(axis=1)
useless_row = row_missing[row_missing >= 5]
useless_row.name = 'Useless companies'
len(useless_row)
58
# Drop the flagged rows and report the before/after missing-value counts.
missing.drop(index=useless_row.index, inplace=True)
raw_mask = (dataset[f'{n}year'] == '?')  # computed once instead of twice
print("old dataset : ", raw_mask.sum().sum(), "omitted\nshape :", raw_mask.shape)
print("dataset cleaned : ", missing.sum().sum(), "omitted\nshape :", missing.shape)
old dataset : 9888 omitted shape : (10502, 65) dataset cleaned : 36 omitted shape : (10444, 51)
Dataset of the nth year, now with far fewer missing values
# Working copy of the n-th dataset with the sparse columns and rows removed.
data = dataset[f'{n}year'].copy()
data = data.drop(columns=useless_col.index).drop(index=useless_row.index)
Replace the remaining missing values with the column mean
# Mean-impute each column: the mean is computed on the observed ('non-?')
# entries only, then written into the missing cells.
for col in data:
    is_missing = data[col] == '?'
    observed = data.loc[~is_missing, col].astype(float)
    data.loc[is_missing, col] = observed.mean()
data = data.astype(float)
# The target is categorical (0 = solvent, 1 = bankrupt).
data.X65 = data.X65.astype('category')
(data == '?').sum().sum()
0
# Box plots of the first 20 predictors, split by target class, in a 2x10
# plotly grid. The original code also opened an unused matplotlib figure
# (plt.figure(figsize=(12,6))) right before building the plotly figure,
# which only produced the empty "<Figure ... with 0 Axes>" artifact; it is
# removed here.
fig = make_subplots(2, 10, horizontal_spacing=0.02)
idx = np.array([(r, c) for r in range(1, 3) for c in range(1, 11)])
predictors = ['safe' if label == 0 else 'bankrupt' for label in data['X65'].values]
for i, var in enumerate(data.drop('X65', axis=1).columns[0:20]):
    row, col = idx[i]
    fig.append_trace(go.Box(y=data[var], x=predictors, name=var), row=row, col=col)
    # Clip the y-range to the 30th smallest/largest values so extreme
    # outliers do not flatten the boxes.
    ordered = data[var].sort_values()
    fig.update_yaxes(range=(ordered.iloc[30], ordered.iloc[-30]), row=row, col=col)
    fig.update_xaxes(title=var, row=row, col=col)
fig.update_layout(height=800, width=2000)
<Figure size 864x432 with 0 Axes>
# Pearson correlation heatmap; the upper triangle (including the diagonal)
# is masked since the matrix is symmetric.
cor_mat = data.corr()
plt.figure(figsize=(60,15))
plt.title('Correlation plot', fontweight='bold')
upper_triangle = np.triu(np.ones_like(cor_mat, dtype=bool))
sns.heatmap(cor_mat, square=True, annot=True, cmap='seismic', mask=upper_triangle, fmt='.1f')
<AxesSubplot:title={'center':'Correlation plot'}>
From this correlation plot and the analysis of the variable to explain, we can see that multiple variables are highly correlated, or can be discarded because of their low variation, so we could eject some of them without loss of information, in order to improve the performance of our future ML models. We will now check which variables are highly correlated.
# Keep only the strictly-lower triangle of the correlation matrix so each
# correlated pair is counted once, then collect, per column, every variable
# whose absolute correlation with it exceeds 0.9.
cor_mat = data.corr()
lower_triangle = np.tril(np.ones_like(cor_mat), k=-1)
cor_mat = cor_mat * lower_triangle
hightly_cor_var = []
for col in cor_mat:
    strong = cor_mat[col][cor_mat[col].abs() > 0.9].index.values
    hightly_cor_var.append([col, *strong])
# Drop columns with no strong partner (singleton groups).
hightly_cor_var = [group for group in hightly_cor_var if len(group) > 1]
hightly_cor_var
[['X1', 'X7', 'X11', 'X14'], ['X2', 'X3', 'X6', 'X10', 'X25', 'X38', 'X51'], ['X3', 'X6', 'X10', 'X25', 'X38', 'X51'], ['X4', 'X40', 'X46'], ['X6', 'X10', 'X25', 'X38', 'X51'], ['X7', 'X11', 'X14'], ['X8', 'X17'], ['X10', 'X25', 'X38', 'X51'], ['X11', 'X14'], ['X16', 'X26'], ['X19', 'X23', 'X31', 'X42', 'X49'], ['X22', 'X35'], ['X23', 'X31', 'X42', 'X49'], ['X25', 'X38', 'X51'], ['X30', 'X62'], ['X31', 'X42', 'X49'], ['X38', 'X51'], ['X40', 'X46'], ['X42', 'X49'], ['X43', 'X44'], ['X56', 'X58']]
Remember the corresponding description?
# Print the human-readable description of every variable that appears in at
# least one highly-correlated group. The membership set is built once;
# the old code evaluated set(l) anew for every dictionary entry inside the
# generator expression, doing quadratic work for the same result.
referenced = set()
for group in hightly_cor_var:
    referenced.update(group)
print('\n'.join(k + '\t=>\t' + v for k, v in description.items() if k in referenced))
X1 => net profit / total assets X2 => total liabilities / total assets X3 => working capital / total assets X4 => current assets / short-term liabilities X6 => retained earnings / total assets X7 => EBIT / total assets X8 => book value of equity / total liabilities X10 => equity / total assets X11 => (gross profit + extraordinary items + financial expenses) / total assets X14 => (gross profit + interest) / total assets X16 => (gross profit + depreciation) / total liabilities X17 => total assets / total liabilities X19 => gross profit / sales X22 => profit on operating activities / total assets X23 => net profit / sales X25 => (equity - share capital) / total assets X26 => (net profit + depreciation) / total liabilities X30 => (total liabilities - cash) / sales X31 => (gross profit + interest) / sales X35 => profit on sales / total assets X38 => constant capital / total assets X40 => (current assets - inventory - receivables) / short-term liabilities X42 => profit on operating activities / sales X43 => rotation receivables + inventory turnover in days X44 => (receivables * 365) / sales X46 => (current assets - inventory) / short-term liabilities X49 => EBITDA (profit on operating activities - depreciation) / sales X51 => short-term liabilities / total assets X56 => (sales - cost of products sold) / sales X58 => total costs / total sales X62 => (short-term liabilities *365) / sales
So for instance, we could say that X1 and X7, which are respectively $ \frac{net\:profit}{total\:assets}$ and $\frac{EBIT}{total\:assets}$, are highly correlated — which makes sense. Let's show some of the correlated variables:
# Scatter plots for nine representative correlated pairs. Each spec is
# (x var, y var, xlim, ylim); a None xlim means "no axis tweaking"
# (first panel), otherwise the axes are clipped and labelled with the
# variables' descriptions.
f, ax = plt.subplots(3, 3, figsize=(20, 20))
pair_specs = [
    ('X1',  'X7',  None,           None),
    ('X2',  'X3',  (-10, 15),      (-15, 5)),
    ('X23', 'X31', (-50, 50),      (-50, 50)),
    ('X8',  'X17', (-500, 1000),   (-500, 1000)),
    ('X43', 'X44', (-5000, 10000), (-5000, 10000)),
    ('X11', 'X14', (-10, 10),      (-10, 10)),
    ('X16', 'X26', (-100, 100),    (-100, 100)),
    ('X22', 'X35', (-10, 10),      (-10, 10)),
    ('X56', 'X58', (-10, 10),      (-10, 10)),
]
for axis, (vx, vy, xlim, ylim) in zip(ax.flat, pair_specs):
    plot = sns.scatterplot(data[vx], data[vy], ax=axis)
    if xlim is not None:
        plot.set(xlim=xlim, ylim=ylim,
                 xlabel=description[vx], ylabel=description[vy])
[(-10.0, 10.0), (-10.0, 10.0), Text(0.5, 0, '(sales - cost of products sold) / sales'), Text(0, 0.5, 'total costs / total sales')]
And now, let's perform a PCA to analyse which variables are the most informative
We start by standardizing the data matrix and then apply a PCA
# Standardize the predictors (X65 is the target, so it is excluded),
# then fit a full PCA and project the data onto the components.
features = data.drop('X65', axis=1)
ss_data = StandardScaler().fit_transform(features)
pca = PCA()
pc_data = pca.fit_transform(ss_data)
To see how many components are really needed for the future model — the fewer variables there are, the easier and faster our model will be to train
# Scree plot: cumulative explained variance with markers at the first
# component crossing 70% and 99%, plus an inset of the per-component
# explained-variance ratio on a log scale.
f = plt.figure(figsize=(25,12))
pcs = [f'C{i}' for i in range(1, ss_data.shape[1] + 1)]
cumul = np.cumsum(pca.explained_variance_ratio_)
cumul70 = int(np.argmax(cumul > 0.7))   # first index above 70%
cumul99 = int(np.argmax(cumul > 0.99))  # first index above 99%
sns.barplot(pcs, cumul)
sns.lineplot(pcs, cumul, marker='o', ms=10)
for threshold_pos, line_color, label in ((cumul70, 'orange', 'Upper to 70%'),
                                         (cumul99, 'lime', 'Upper to 99%')):
    plt.axvline(threshold_pos, 0, 1, c=line_color, lw=6, ls=':')
    plt.text(threshold_pos + 0.5, 0.1, label, size=30)
plt.title('Cumulative explained variance in percentage', dict(fontsize=20, fontweight='bold'))
plt.xlabel('Principal components')
ax = f.add_axes([0.55, 0.4, 0.27, 0.4], title='Explained variance ratio - eigenvalues')
ax.get_xaxis().set_visible(False)
sns.barplot(pcs, pca.explained_variance_ratio_, log=True)
<matplotlib.axes._axes.Axes at 0x258481b3250>
It is shown here that only 23 principal components would really be needed to represent almost all of the information in this dataset, and this is due to a lot of redundancy among the variables.
Here, we won't show the graph of the individuals because of the size of the dataset, so we will only show the graph of variables
# Loadings plot: project each ORIGINAL variable onto the first two
# principal components. pca.components_ has shape
# (n_components, n_features), so the loadings of variable i on PC1/PC2 are
# components_[0:2, i]. The previous code sliced [:, 0:2] — the first two
# *features* across all components — which only avoided an IndexError
# because the matrix happens to be square here.
eig_vecs = pca.components_[0:2].T  # shape: (n_features, 2)
# Label arrows with the columns actually fed to the PCA (X65 was excluded),
# instead of indexing data.columns, which still contains the target.
feature_names = data.drop('X65', axis=1).columns
arrows = []
for i in range(eig_vecs.shape[0]):
    arrows.append(
        go.layout.Annotation(dict(
            x=eig_vecs[i, 0], y=eig_vecs[i, 1], ax=0, ay=0,
            xref="x", yref="y", axref="x", ayref='y',
            showarrow=True, arrowhead=3, arrowwidth=1.5, arrowcolor='red'))
    )
    arrows.append(
        go.layout.Annotation(dict(
            x=eig_vecs[i, 0], y=eig_vecs[i, 1],
            showarrow=False, text=feature_names[i]))
    )
f = go.Figure(go.Scatter())
f.update_layout(title='Loadings plot', annotations=arrows)
It's quite difficult to interpret this type of graph — it is confusing. However, we can see immediately that almost all variables are poorly represented, because the arrows don't reach the boundary of the unit circle.
%%javascript
IPython.notebook.save_notebook()
!jupyter nbconvert --to html visualisation.ipynb
[NbConvertApp] Converting notebook visualisation.ipynb to html [NbConvertApp] Writing 8302458 bytes to visualisation.html